// sudo apt install liburing-dev

#if 0
// io_uring(7)                                   Linux Programmer's Manual                                   io_uring(7)
// 
// NAME
//        io_uring - Asynchronous I/O facility
// 
// SYNOPSIS
//        #include <linux/io_uring.h>
// 
// DESCRIPTION
//        io_uring is a Linux-specific API for asynchronous I/O.  It allows the user to submit one or more I/O requests,
//        which are processed asynchronously without blocking the calling process.  io_uring gets  its  name  from  ring
//        buffers which are shared between user space and kernel space. This arrangement allows for efficient I/O, while
//        avoiding the overhead of copying buffers between them, where possible.  This interface makes io_uring  differ‐
//        ent  from other UNIX I/O APIs, wherein, rather than just communicate between kernel and user space with system
//        calls, ring buffers are used as the main mode of communication.  This arrangement has various performance ben‐
//        efits  which  are  discussed in a separate section below.  This man page uses the terms shared buffers, shared
//        ring buffers and queues interchangeably.
// 
//        The general programming model you need to follow for io_uring is outlined below
// 
//        •      Set up shared buffers with io_uring_setup(2) and mmap(2), mapping into user space  shared  buffers  for
//               the  submission  queue  (SQ) and the completion queue (CQ).  You place I/O requests you want to make on
//               the SQ, while the kernel places the results of those operations on the CQ.
// 
//        •      For every I/O request you need to make (like to read a file, write a file, accept a socket  connection,
//               etc),  you create a submission queue entry, or SQE, describe the I/O operation you need to get done and
//               add it to the tail of the submission queue (SQ).  Each I/O operation is, in essence, the equivalent  of
//               a system call you would have made otherwise, if you were not using io_uring.  You can add more than one
//               SQE to the queue depending on the number of operations you want to request.
// 
//        •      After you add one or more SQEs, you need to call io_uring_enter(2) to tell the kernel to  dequeue  your
//               I/O requests off the SQ and begin processing them.
// 
//        •      For  each  SQE you submit, once it is done processing the request, the kernel places a completion queue
//               event or CQE at the tail of the completion queue or CQ.  The kernel places exactly one matching CQE  in
//               the  CQ  for  every SQE you submit on the SQ.  After you retrieve a CQE, minimally, you might be inter‐
//               ested in checking the res field of the CQE structure, which corresponds to the return value of the sys‐
//               tem call's equivalent, had you used it directly without using io_uring.  For instance, a read operation
//               under io_uring, started with the IORING_OP_READ operation, issues the equivalent of the read(2)  system
//               call.  In practice, it mixes the semantics of pread(2) and preadv2(2) in that it takes an explicit off‐
//               set, and supports using -1 for the offset to indicate that the current file position should be used in‐
//               stead  of  passing  in  an  explicit  offset. See the opcode documentation for more details. Given that
//               io_uring is an async interface, errno is never used for passing back error  information.  Instead,  res
//               will contain what the equivalent system call would have returned in case of success, and in case of er‐
//               ror res will contain -errno .  For example, if the normal read system call would have returned  -1  and
//               set errno to EINVAL , then res would contain -EINVAL .  If the normal system call would have returned a
//               read size of 1024, then res would contain 1024.
// 
//        •      Optionally, io_uring_enter(2) can also wait for a specified number of requests to be processed  by  the
//               kernel  before  it  returns.   If you specified a certain number of completions to wait for, the kernel
//               would have placed at least those many number of CQEs on the CQ, which you can then readily read,  right
//               after the return from io_uring_enter(2).
// 
//        •      It is important to remember that I/O requests submitted to the kernel can complete in any order.  It is
//               not necessary for the kernel to process one request after another, in the order you placed them.  Given
//               that  the interface is a ring, the requests are attempted in order, however that doesn't imply any sort
//               of ordering on their completion.  When more than one request is in flight, it is not possible to deter‐
//               mine  which  one  will complete first.  When you dequeue CQEs off the CQ, you should always check which
//               submitted request it corresponds to.  The most common method for doing so is  utilizing  the  user_data
//               field in the request, which is passed back on the completion side.
// 
//        Adding to and reading from the queues:
// 
//        •      You add SQEs to the tail of the SQ.  The kernel reads SQEs off the head of the queue.
// 
//        •      The kernel adds CQEs to the tail of the CQ.  You read CQEs off the head of the queue.
// 
//    Submission queue polling
//        One  of  the  goals  of  io_uring  is  to provide a means for efficient I/O.  To this end, io_uring supports a
//        polling mode that lets you avoid the call to io_uring_enter(2), which you use to inform the  kernel  that  you
//        have  queued  SQEs  on  to the SQ.  With SQ Polling, io_uring starts a kernel thread that polls the submission
//        queue for any I/O requests you submit by adding SQEs.  With SQ Polling enabled, there is no need  for  you  to
//        call  io_uring_enter(2),  letting you avoid the overhead of system calls.  A designated kernel thread dequeues
//        SQEs off the SQ as you add them and dispatches them for asynchronous processing.
// 
//    Setting up io_uring
//        The main steps in setting up io_uring consist of mapping in the shared buffers with mmap(2) calls.  In the ex‐
//        ample  program  included  in this man page, the function app_setup_uring() sets up io_uring with a QUEUE_DEPTH
//        deep submission queue.  Pay attention to the 2 mmap(2) calls that set up the shared submission and  completion
//        queues.  If your kernel is older than version 5.4, three mmap(2) calls are required.
// 
//    Submitting I/O requests
//        The  process  of  submitting  a request consists of describing the I/O operation you need to get done using an
//        io_uring_sqe structure instance.  These details describe the equivalent system call and its  parameters.   Be‐
//        cause  the  range  of I/O operations Linux supports are very varied and the io_uring_sqe structure needs to be
//        able to describe them, it has several fields, some packed into unions for space efficiency.  Here is a simpli‐
//        fied version of struct io_uring_sqe with some of the most often used fields:
// 
//            struct io_uring_sqe {
//                    __u8    opcode;         /* type of operation for this sqe */
//                    __s32   fd;             /* file descriptor to do IO on */
//                    __u64   off;            /* offset into file */
//                    __u64   addr;           /* pointer to buffer or iovecs */
//                    __u32   len;            /* buffer size or number of iovecs */
//                    __u64   user_data;      /* data to be passed back at completion time */
//                    __u8    flags;          /* IOSQE_ flags */
//                    ...
//            };
// 
//        Here is struct io_uring_sqe in full:
// 
//            struct io_uring_sqe {
//                    __u8    opcode;         /* type of operation for this sqe */
//                    __u8    flags;          /* IOSQE_ flags */
//                    __u16   ioprio;         /* ioprio for the request */
//                    __s32   fd;             /* file descriptor to do IO on */
//                    union {
//                            __u64   off;    /* offset into file */
//                            __u64   addr2;
//                    };
//                    union {
//                            __u64   addr;   /* pointer to buffer or iovecs */
//                            __u64   splice_off_in;
//                    };
//                    __u32   len;            /* buffer size or number of iovecs */
//                    union {
//                            __kernel_rwf_t  rw_flags;
//                            __u32           fsync_flags;
//                            __u16           poll_events;    /* compatibility */
//                            __u32           poll32_events;  /* word-reversed for BE */
//                            __u32           sync_range_flags;
//                            __u32           msg_flags;
//                            __u32           timeout_flags;
//                            __u32           accept_flags;
//                            __u32           cancel_flags;
//                            __u32           open_flags;
//                            __u32           statx_flags;
//                            __u32           fadvise_advice;
//                            __u32           splice_flags;
//                    };
//                    __u64   user_data;      /* data to be passed back at completion time */
//                    union {
//                            struct {
//                                    /* pack this to avoid bogus arm OABI complaints */
//                                    union {
//                                            /* index into fixed buffers, if used */
//                                            __u16   buf_index;
//                                            /* for grouped buffer selection */
//                                            __u16   buf_group;
//                                    } __attribute__((packed));
//                                    /* personality to use, if used */
//                                    __u16   personality;
//                                    __s32   splice_fd_in;
//                            };
//                            __u64   __pad2[3];
//                    };
//            };
// 
//        To  submit  an I/O request to io_uring, you need to acquire a submission queue entry (SQE) from the submission
//        queue (SQ), fill it up with details of the operation you want to submit and call io_uring_enter(2).  There are
//        helper  functions  of the form io_uring_prep_X to enable proper setup of the SQE. If you want to avoid calling
//        io_uring_enter(2), you have the option of setting up Submission Queue Polling.
// 
//        SQEs are added to the tail of the submission queue.  The kernel picks up SQEs off the head  of  the  SQ.   The
//        general algorithm to get the next available SQE and update the tail is as follows.
// 
//            struct io_uring_sqe *sqe;
//            unsigned tail, index;
//            tail = *sqring->tail;
//            index = tail & (*sqring->ring_mask);
//            sqe = &sqring->sqes[index];
//            /* fill up details about this I/O request */
//            describe_io(sqe);
//            /* fill the sqe index into the SQ ring array */
//            sqring->array[index] = index;
//            tail++;
//            atomic_store_release(sqring->tail, tail);
// 
//        To get the index of an entry, the application must mask the current tail index with the size mask of the ring.
//        This holds true for both SQs and CQs.  Once the SQE is acquired, the necessary fields are filled in,  describ‐
//        ing  the request.  While the CQ ring directly indexes the shared array of CQEs, the submission side has an in‐
//        direction array between them.  The submission side ring buffer is an index into this array, which in turn con‐
//        tains the index into the SQEs.
// 
//        The following code snippet demonstrates how a read operation, an equivalent of a preadv2(2) system call is de‐
//        scribed by filling up an SQE with the necessary parameters.
// 
//            struct iovec iovecs[16];
//             ...
//            sqe->opcode = IORING_OP_READV;
//            sqe->fd = fd;
//            sqe->addr = (unsigned long) iovecs;
//            sqe->len = 16;
//            sqe->off = offset;
//            sqe->flags = 0;
// 
//        Memory ordering
//               Modern compilers and CPUs freely reorder reads and writes without affecting the  program's  outcome  to
//               optimize  performance.   Some aspects of this need to be kept in mind on SMP systems since io_uring in‐
//               volves buffers shared between kernel and user space.  These buffers are  both  visible  and  modifiable
//               from kernel and user space.  As heads and tails belonging to these shared buffers are updated by kernel
//               and user space, changes need to be coherently visible on either side, irrespective  of  whether  a  CPU
//               switch  took  place after the kernel-user mode switch happened.  We use memory barriers to enforce this
//               coherency.  Being significantly large subjects on their own, memory barriers are out of scope for  fur‐
//               ther discussion on this man page.
// 
//        Letting the kernel know about I/O submissions
//               Once you place one or more SQEs on to the SQ, you need to let the kernel know that you've done so.  You
//               can do this by calling the io_uring_enter(2) system call.  This system call is also capable of  waiting
//               for  a  specified  count of events to complete.  This way, you can be sure to find completion events in
//               the completion queue without having to poll it for events later.
// 
//    Reading completion events
//        Similar to the submission queue (SQ), the completion queue (CQ) is a shared buffer between the kernel and user
//        space.   Whereas  you  placed submission queue entries on the tail of the SQ and the kernel read off the head,
//        when it comes to the CQ, the kernel places completion queue events or CQEs on the tail of the CQ and you  read
//        off its head.
// 
//        Submission  is  flexible (and thus a bit more complicated) since it needs to be able to encode different types
//        of system calls that take various parameters.  Completion, on the other hand is simpler  since  we're  looking
//        only  for  a  return value back from the kernel.  This is easily understood by looking at the completion queue
//        event structure, struct io_uring_cqe:
// 
//            struct io_uring_cqe {
//                 __u64     user_data;  /* sqe->data submission passed back */
//                 __s32     res;        /* result code for this event */
//                 __u32     flags;
//            };
// 
//        Here, user_data is custom data that is passed unchanged from submission to completion.  That is, from SQEs  to
//        CQEs.  This field can be used to set context, uniquely identifying submissions that got completed.  Given that
//        I/O requests can complete in any order, this field can be used to correlate a submission  with  a  completion.
//        res is the result from the system call that was performed as part of the submission; its return value.
// 
//        The flags field carries request-specific information. As of the 6.0 kernel, the following flags are defined:
// 
//        IORING_CQE_F_BUFFER
//               If  set,  the  upper 16 bits of the flags field carries the buffer ID that was chosen for this request.
//               The request must have been issued with IOSQE_BUFFER_SELECT set, and used with a request type that  sup‐
//               ports  buffer  selection.  Additionally,  buffers  must  have been provided upfront either via the IOR‐
//               ING_OP_PROVIDE_BUFFERS or the IORING_REGISTER_PBUF_RING methods.
// 
//        IORING_CQE_F_MORE
//               If set, the application should expect more completions from the request. This is used for requests that
//               can generate multiple completions, such as multi-shot requests, receive, or accept.
// 
//        IORING_CQE_F_SOCK_NONEMPTY
//               If  set, upon receiving the data from the socket in the current request, the socket still had data left
//               on completion of this request.
// 
//        IORING_CQE_F_NOTIF
//               Set for notification CQEs, as seen with the zero-copy networking send and receive support.
// 
//        The general sequence to read completion events off the completion queue is as follows:
// 
//            unsigned head;
//            head = *cqring->head;
//            if (head != atomic_load_acquire(cqring->tail)) {
//                struct io_uring_cqe *cqe;
//                unsigned index;
//                index = head & (cqring->mask);
//                cqe = &cqring->cqes[index];
//                /* process completed CQE */
//                process_cqe(cqe);
//                /* CQE consumption complete */
//                head++;
//            }
//            atomic_store_release(cqring->head, head);
// 
//        It helps to be reminded that the kernel adds CQEs to the tail of the CQ, while you need to  dequeue  them  off
//        the head.  To get the index of an entry at the head, the application must mask the current head index with the
//        size mask of the ring.  Once the CQE has been consumed or processed, the head needs to be updated  to  reflect
//        the consumption of the CQE.  Attention should be paid to the read and write barriers to ensure successful read
//        and update of the head.
// 
//    io_uring performance
//        Because of the shared ring buffers between kernel and user space, io_uring can be a zero-copy system.  Copying
//        buffers  to  and from becomes necessary when system calls that transfer data between kernel and user space are
//        involved.  But since the bulk of the communication in io_uring is via buffers shared between  the  kernel  and
//        user space, this huge performance overhead is completely avoided.
// 
//        While system calls may not seem like a significant overhead, in high performance applications, making a lot of
//        them will begin to matter.  While workarounds the operating system has in place to deal with Spectre and Melt‐
//        down  are ideally best done away with, unfortunately, some of these workarounds are around the system call in‐
//        terface, making system calls not as cheap as before on affected hardware.  While  newer  hardware  should  not
//        need these workarounds, hardware with these vulnerabilities can be expected to be in the wild for a long time.
//        While using synchronous programming interfaces or even when using asynchronous  programming  interfaces  under
//        Linux,  there  is  at  least  one system call involved in the submission of each request.  In io_uring, on the
//        other hand, you can batch several requests in one go, simply by queueing up multiple SQEs, each describing  an
//        I/O operation you want and make a single call to io_uring_enter(2).  This is possible due to io_uring's shared
//        buffers based design.
// 
//        While this batching in itself can avoid the overhead associated with potentially multiple and frequent  system
//        calls,  you can reduce even this overhead further with Submission Queue Polling, by having the kernel poll and
//        pick up your SQEs for processing as you add them to the submission queue. This  avoids  the  io_uring_enter(2)
//        call  you need to make to tell the kernel to pick SQEs up.  For high-performance applications, this means even
//        fewer system call overheads.
// 
// CONFORMING TO
//        io_uring is Linux-specific.
// 
// EXAMPLES
//        The following example uses io_uring to copy stdin to stdout.  Using shell redirection, you should be  able  to
//        copy  files with this example.  Because it uses a queue depth of only one, this example processes I/O requests
//        one after the other.  It is purposefully kept this way to aid understanding.  In real-world scenarios however,
//        you'll  want to have a larger queue depth to parallelize I/O request processing so as to gain the kind of per‐
//        formance benefits io_uring provides with its asynchronous processing of requests.
#endif

#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <linux/fs.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <stdatomic.h>

#include <linux/io_uring.h>

#define QUEUE_DEPTH 1
#define BLOCK_SZ    1024

/*
 * Barrier helpers for the ring indices that are shared with the kernel.
 *
 * io_uring_smp_store_release(p, v): store v to *p with release ordering, so
 *   every write made before the store is visible once the new value is seen.
 * io_uring_smp_load_acquire(p): load *p with acquire ordering, so reads made
 *   after the load cannot be reordered before it.
 *
 * p is a pointer to a plain (non-_Atomic) object; it is cast to the matching
 * _Atomic pointer type. __typeof__ is used rather than typeof because typeof
 * is not a keyword in strict ISO modes such as -std=c11.
 */
#define io_uring_smp_store_release(p, v)            \
    atomic_store_explicit((_Atomic __typeof__(*(p)) *)(p), (v), \
                  memory_order_release)
#define io_uring_smp_load_acquire(p)                \
    atomic_load_explicit((_Atomic __typeof__(*(p)) *)(p),   \
                 memory_order_acquire)

/* File descriptor of the io_uring instance; assigned in app_setup_uring(). */
int ring_fd;
/*
 * Pointers into the mmap()ed submission ring (sring_*) and completion ring
 * (cring_*). app_setup_uring() computes them from the offsets reported in
 * struct io_uring_params.
 */
unsigned *sring_tail, *sring_mask, *sring_array,
            *cring_head, *cring_tail, *cring_mask;
/* Base of the mmap()ed submission queue entry array. */
struct io_uring_sqe *sqes;
/* Base of the completion queue entry array inside the CQ ring mapping. */
struct io_uring_cqe *cqes;
/* Single data buffer used by both the read and the write requests. */
char buff[BLOCK_SZ];
/* File offset copied into each request's sqe->off; advanced by main(). */
off_t offset;

/*
 * System call wrappers provided since glibc does not yet
 * provide wrappers for io_uring system calls.
* */

/*
 * Invoke the raw io_uring_setup system call.
 *
 * entries and p are forwarded unchanged. The syscall() result is returned
 * narrowed to int; the caller in this file treats a negative value as
 * failure and reports it with perror().
 */
int io_uring_setup(unsigned entries, struct io_uring_params *p)
{
    long rc = syscall(__NR_io_uring_setup, entries, p);

    return (int) rc;
}

/*
 * Invoke the raw io_uring_enter system call.
 *
 * ring_fd, to_submit, min_complete and flags are forwarded unchanged; the
 * two trailing syscall arguments are always passed as NULL and 0. Note that
 * the ring_fd parameter shadows the file-scope variable of the same name.
 * The syscall() result is returned narrowed to int.
 */
int io_uring_enter(int ring_fd, unsigned int to_submit,
                   unsigned int min_complete, unsigned int flags)
{
    long rc;

    rc = syscall(__NR_io_uring_enter, ring_fd, to_submit,
                 min_complete, flags, NULL, 0);
    return (int) rc;
}

int app_setup_uring(void) {
    struct io_uring_params p;
    void *sq_ptr, *cq_ptr;

    /* See io_uring_setup(2) for io_uring_params.flags you can set */
    memset(&p, 0, sizeof(p));
    ring_fd = io_uring_setup(QUEUE_DEPTH, &p);
    if (ring_fd < 0) {
        perror("io_uring_setup");
        return 1;
    }

    /*
     * io_uring communication happens via 2 shared kernel-user space ring
     * buffers, which can be jointly mapped with a single mmap() call in
     * kernels >= 5.4.
     */

    int sring_sz = p.sq_off.array + p.sq_entries * sizeof(unsigned);
    int cring_sz = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe);

    /* Rather than check for kernel version, the recommended way is to
     * check the features field of the io_uring_params structure, which is a
     * bitmask. If IORING_FEAT_SINGLE_MMAP is set, we can do away with the
     * second mmap() call to map in the completion ring separately.
     */
    if (p.features & IORING_FEAT_SINGLE_MMAP) {
        if (cring_sz > sring_sz)
            sring_sz = cring_sz;
        cring_sz = sring_sz;
    }

    /* Map in the submission and completion queue ring buffers.
     *  Kernels < 5.4 only map in the submission queue, though.
     */
    sq_ptr = mmap(0, sring_sz, PROT_READ | PROT_WRITE,
                  MAP_SHARED | MAP_POPULATE,
                  ring_fd, IORING_OFF_SQ_RING);
    if (sq_ptr == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    if (p.features & IORING_FEAT_SINGLE_MMAP) {
        cq_ptr = sq_ptr;
    } else {
        /* Map in the completion queue ring buffer in older kernels separately */
        cq_ptr = mmap(0, cring_sz, PROT_READ | PROT_WRITE,
                      MAP_SHARED | MAP_POPULATE,
                      ring_fd, IORING_OFF_CQ_RING);
        if (cq_ptr == MAP_FAILED) {
            perror("mmap");
            return 1;
        }
    }
    /* Save useful fields for later easy reference */
    sring_tail = sq_ptr + p.sq_off.tail;
    sring_mask = sq_ptr + p.sq_off.ring_mask;
    sring_array = sq_ptr + p.sq_off.array;

    /* Map in the submission queue entries array */
    sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe),
                   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
                   ring_fd, IORING_OFF_SQES);
    if (sqes == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    /* Save useful fields for later easy reference */
    cring_head = cq_ptr + p.cq_off.head;
    cring_tail = cq_ptr + p.cq_off.tail;
    cring_mask = cq_ptr + p.cq_off.ring_mask;
    cqes = cq_ptr + p.cq_off.cqes;

    return 0;
}

/*
 * Read from completion queue.
 * Dequeue one completion queue event (CQE), advance the CQ head and return
 * the result of the operation.
 *
 * Returns the CQE's res field (negative values are -errno and are also
 * reported on stderr), or -1 if the completion queue is empty.
 */
int read_from_cq(void) {
    struct io_uring_cqe *cqe;
    unsigned head;
    int res;

    /* Only this program writes the head, so a plain read is sufficient. */
    head = *cring_head;

    /*
     * The tail is written by the kernel. Load it with acquire semantics so
     * the CQE contents published before the tail update are visible below.
     * Remember, this is a ring buffer. If head == tail, it means that the
     * buffer is empty.
     */
    if (head == io_uring_smp_load_acquire(cring_tail))
        return -1;

    /* Get the entry */
    cqe = &cqes[head & (*cring_mask)];

    /*
     * Copy the result out before releasing the slot: once the new head is
     * published the kernel is free to reuse this CQE.
     */
    res = cqe->res;
    if (res < 0)
        fprintf(stderr, "Error: %s\n", strerror(abs(res)));

    head++;

    /* Write barrier so that the update to the head is made visible */
    io_uring_smp_store_release(cring_head, head);

    return res;
}

/*
 * Submit a read or a write request to the submission queue and wait for it
 * to complete.
 *
 * fd: file descriptor the request operates on.
 * op: IORING_OP_READ reads up to BLOCK_SZ bytes into buff; any other opcode
 *     is treated as a write of the NUL-terminated contents of buff (at most
 *     BLOCK_SZ bytes). Because the length is derived from the first NUL,
 *     data containing NUL bytes is written only up to that byte.
 *
 * Returns the value returned by io_uring_enter(), or -1 after printing a
 * message with perror() if that call fails.
 */
int submit_to_sq(int fd, int op) {
    unsigned index, tail;

    /* Add our submission queue entry to the tail of the SQE ring buffer */
    tail = *sring_tail;
    index = tail & *sring_mask;
    struct io_uring_sqe *sqe = &sqes[index];

    /* Start from a clean entry so no field of an earlier request leaks in */
    memset(sqe, 0, sizeof(*sqe));

    /* Fill in the parameters required for the read or write operation */
    sqe->opcode = (__u8) op;
    sqe->fd = fd;
    sqe->addr = (unsigned long) buff;
    if (op == IORING_OP_READ) {
        memset(buff, 0, sizeof(buff));
        sqe->len = BLOCK_SZ;
    }
    else {
        /*
         * A read that filled the whole buffer leaves no terminating NUL, so
         * the search is bounded by the buffer size instead of using strlen().
         */
        const char *nul = memchr(buff, '\0', sizeof(buff));

        sqe->len = nul ? (unsigned) (nul - buff) : (unsigned) sizeof(buff);
    }
    sqe->off = offset;

    /* Point the SQ ring slot at the SQE we just filled in */
    sring_array[index] = index;
    tail++;

    /* Update the tail; release ordering publishes the SQE writes above */
    io_uring_smp_store_release(sring_tail, tail);

    /*
    * Tell the kernel we have submitted events with the io_uring_enter()
    * system call. We also pass in the IORING_ENTER_GETEVENTS flag which
    * causes the io_uring_enter() call to wait until min_complete
    * (the 3rd param) events complete.
    * */
    int ret =  io_uring_enter(ring_fd, 1,1,
                              IORING_ENTER_GETEVENTS);
    if(ret < 0) {
        perror("io_uring_enter");
        return -1;
    }

    return ret;
}

/*
 * Copy stdin to stdout one block at a time using io_uring.
 *
 * Command-line arguments are not used. Returns 0 when EOF is reached after
 * every block was written in full, and 1 if the ring cannot be set up, a
 * request cannot be submitted, a read or write fails, or a write transfers
 * a different number of bytes than the preceding read.
 */
int main(int argc, char *argv[]) {
    int res;
    int status = 0;

    (void) argc;
    (void) argv;

    /* Setup io_uring for use */
    if(app_setup_uring()) {
        fprintf(stderr, "Unable to setup uring!\n");
        return 1;
    }

    /*
    * A while loop that reads from stdin and writes to stdout.
    * Breaks on EOF.
    */
    while (1) {
        /* Initiate read from stdin and wait for it to complete */
        if (submit_to_sq(STDIN_FILENO, IORING_OP_READ) < 0) {
            /* submit_to_sq() has already reported the failure */
            status = 1;
            break;
        }
        /* Read completion queue entry */
        res = read_from_cq();
        if (res == 0) {
            /* reached EOF */
            break;
        }
        if (res < 0) {
            /* Error reading file */
            fprintf(stderr, "Error: %s\n", strerror(abs(res)));
            status = 1;
            break;
        }

        /* Read successful. Write to stdout. */
        if (submit_to_sq(STDOUT_FILENO, IORING_OP_WRITE) < 0) {
            status = 1;
            break;
        }
        int wres = read_from_cq();
        if (wres < 0) {
            fprintf(stderr, "Write failed: %s\n", strerror(abs(wres)));
            status = 1;
            break;
        }
        if (wres != res) {
            /* Continuing would silently produce a corrupt copy. */
            fprintf(stderr, "Short write: %d of %d bytes\n", wres, res);
            status = 1;
            break;
        }

        /* Both requests use the same file offset; advance it by one block */
        offset += res;
    }

    return status;
}

